MARCADOR.jpg

Teste de Hipóteses e ANOVA em Dados Financeiros¶

Bibliotecas¶

In [1]:
import pandas as pd
import yfinance as yf
import numpy as np
import math
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from scipy import stats
from scipy import optimize
import statsmodels.api as sm
from statsmodels.formula.api import ols
from bioinfokit.analys import stat
from scipy.stats import mannwhitneyu
from scipy.stats import wilcoxon
from scipy.stats import kruskal

Baixando os dados de ações da B3¶

In [2]:
acoes = ['ITUB3.SA', 'MGLU3.SA', 'CIEL3.SA', 'PETR3.SA', 'CSNA3.SA', '^BVSP']
acoes
Out[2]:
['ITUB3.SA', 'MGLU3.SA', 'CIEL3.SA', 'PETR3.SA', 'CSNA3.SA', '^BVSP']
In [3]:
acoes_df = pd.DataFrame()
for acao in acoes:
    acoes_df[acao] = yf.download(acao,
            start='2012-01-01', end='2023-01-01')['Close']
acoes_df.index = acoes_df.index.strftime('%Y-%m-%d')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
In [4]:
acoes_df.reset_index(inplace=True)
acoes_df
Out[4]:
Date ITUB3.SA MGLU3.SA CIEL3.SA PETR3.SA CSNA3.SA ^BVSP
0 2012-01-02 12.845206 0.290625 9.715390 23.209999 15.11 NaN
1 2012-01-03 13.072877 0.288750 9.751559 24.020000 15.89 59265.0
2 2012-01-04 13.195819 0.285937 9.785718 24.170000 16.00 59365.0
3 2012-01-05 12.990915 0.283437 9.645061 24.020000 15.87 58546.0
4 2012-01-06 13.182159 0.291250 9.584780 24.000000 15.83 58600.0
... ... ... ... ... ... ... ...
2724 2022-12-23 21.860001 2.680000 5.180000 28.559999 13.91 109698.0
2725 2022-12-26 21.580000 2.660000 5.130000 28.469999 14.14 108738.0
2726 2022-12-27 21.379999 2.520000 5.080000 28.660000 14.46 108347.0
2727 2022-12-28 21.930000 2.690000 5.210000 28.500000 14.79 110237.0
2728 2022-12-29 21.889999 2.740000 5.240000 28.040001 14.55 110031.0

2729 rows × 7 columns

In [5]:
dataset = acoes_df.copy()
dataset.drop(labels = ['Date'], axis=1, inplace=True)
taxas_retorno = np.log(dataset / dataset.shift(1))
dataset_date = acoes_df.copy()
date = dataset_date.filter(["Date"])
taxas_retorno = pd.concat([date, taxas_retorno], axis=1)
taxas_retorno
Out[5]:
Date ITUB3.SA MGLU3.SA CIEL3.SA PETR3.SA CSNA3.SA ^BVSP
0 2012-01-02 NaN NaN NaN NaN NaN NaN
1 2012-01-03 0.017569 -0.006473 0.003716 0.034304 0.050333 NaN
2 2012-01-04 0.009360 -0.009790 0.003497 0.006225 0.006899 0.001686
3 2012-01-05 -0.015650 -0.008782 -0.014478 -0.006225 -0.008158 -0.013892
4 2012-01-06 0.014614 0.027192 -0.006270 -0.000833 -0.002524 0.000922
... ... ... ... ... ... ... ...
2724 2022-12-23 0.013355 0.030305 0.021464 0.049894 -0.022744 0.021944
2725 2022-12-26 -0.012892 -0.007491 -0.009699 -0.003156 0.016400 -0.008790
2726 2022-12-27 -0.009311 -0.054067 -0.009794 0.006652 0.022379 -0.003602
2727 2022-12-28 0.025400 0.065282 0.025269 -0.005598 0.022565 0.017294
2728 2022-12-29 -0.001826 0.018417 0.005742 -0.016272 -0.016360 -0.001870

2729 rows × 7 columns

In [6]:
figura = px.line(title = 'Histórico de retorno das ações')
for i in taxas_retorno.columns[1:]:
  figura.add_scatter(x = taxas_retorno["Date"] ,y = taxas_retorno[i], name = i)
figura.show()
20142016201820202022−0.3−0.2−0.100.10.20.3
ITUB3.SAMGLU3.SACIEL3.SAPETR3.SACSNA3.SA^BVSPHistórico de retorno das ações
plotly-logomark

Montando Carteiras de Ativos¶

In [7]:
acoes_port = acoes_df.copy()
acoes_port.drop(labels = ['^BVSP'], axis=1, inplace=True)
log_ret = acoes_port.copy()
log_ret.drop(labels = ["Date"], axis = 1, inplace = True)
log_ret = np.log(log_ret/log_ret.shift(1))
np.random.seed(42)
num_ports = 10000
all_weights = np.zeros((num_ports, len(acoes_port.columns[1:])))
ret_arr = np.zeros(num_ports)
vol_arr = np.zeros(num_ports)
sharpe_arr = np.zeros(num_ports)

for x in range(num_ports):
    # Weights
    weights = np.array(np.random.random(5))
    weights = weights/np.sum(weights)
    
    # Save weights
    all_weights[x,:] = weights
    
    # Expected return
    ret_arr[x] = np.sum((log_ret.mean() * weights))
    
    # Expected volatility
    vol_arr[x] = np.sqrt(np.dot(weights.T, np.dot(log_ret.cov(), weights)))
    
    # Sharpe Ratio
    sharpe_arr[x] = ret_arr[x]/vol_arr[x]
In [8]:
print("Max Sharpe Ratio: {}". format(sharpe_arr.max()))
print("Local do Max Sharpe Ratio: {}". format(sharpe_arr.argmax()))
Max Sharpe Ratio: 0.021130418245689463
Local do Max Sharpe Ratio: 7063
In [9]:
# Pesos do Portfólio do Max Sharpe Ratio
print(all_weights[7063,:])
[0.32020505 0.62806883 0.01391511 0.01269245 0.02511856]
In [10]:
taxas_retorno["CARTEIRA"] = (0.32020505*taxas_retorno["ITUB3.SA"] + 0.62806883*taxas_retorno["MGLU3.SA"] + 
                                   0.01391511*taxas_retorno["CIEL3.SA"] + 0.01269245*taxas_retorno["PETR3.SA"] + 
                                   0.02511856*taxas_retorno["CSNA3.SA"])
taxas_retorno
Out[10]:
Date ITUB3.SA MGLU3.SA CIEL3.SA PETR3.SA CSNA3.SA ^BVSP CARTEIRA
0 2012-01-02 NaN NaN NaN NaN NaN NaN NaN
1 2012-01-03 0.017569 -0.006473 0.003716 0.034304 0.050333 NaN 0.003312
2 2012-01-04 0.009360 -0.009790 0.003497 0.006225 0.006899 0.001686 -0.002850
3 2012-01-05 -0.015650 -0.008782 -0.014478 -0.006225 -0.008158 -0.013892 -0.011012
4 2012-01-06 0.014614 0.027192 -0.006270 -0.000833 -0.002524 0.000922 0.021597
... ... ... ... ... ... ... ... ...
2724 2022-12-23 0.013355 0.030305 0.021464 0.049894 -0.022744 0.021944 0.023671
2725 2022-12-26 -0.012892 -0.007491 -0.009699 -0.003156 0.016400 -0.008790 -0.008596
2726 2022-12-27 -0.009311 -0.054067 -0.009794 0.006652 0.022379 -0.003602 -0.036429
2727 2022-12-28 0.025400 0.065282 0.025269 -0.005598 0.022565 0.017294 0.049982
2728 2022-12-29 -0.001826 0.018417 0.005742 -0.016272 -0.016360 -0.001870 0.010445

2729 rows × 8 columns

In [11]:
acoes_port = acoes_df.copy()
acoes_port.drop(labels = ['^BVSP'], axis=1, inplace=True)
log_ret = acoes_port.copy()
log_ret.drop(labels = ["Date"], axis = 1, inplace = True)
log_ret = np.log(log_ret/log_ret.shift(1))
np.random.seed(123)
num_ports = 10000
all_weights = np.zeros((num_ports, len(acoes_port.columns[1:])))
ret_arr = np.zeros(num_ports)
vol_arr = np.zeros(num_ports)
sharpe_arr = np.zeros(num_ports)

for x in range(num_ports):
    # Weights
    weights = np.array(np.random.random(5))
    weights = weights/np.sum(weights)
    
    # Save weights
    all_weights[x,:] = weights
    
    # Expected return
    ret_arr[x] = np.sum((log_ret.mean() * weights))
    
    # Expected volatility
    vol_arr[x] = np.sqrt(np.dot(weights.T, np.dot(log_ret.cov(), weights)))
    
    # Sharpe Ratio
    sharpe_arr[x] = ret_arr[x]/vol_arr[x]
In [12]:
print("Max Sharpe Ratio: {}". format(sharpe_arr.max()))
print("Local do Max Sharpe Ratio: {}". format(sharpe_arr.argmax()))
Max Sharpe Ratio: 0.02089881321285652
Local do Max Sharpe Ratio: 7547
In [13]:
# Pesos do Portfólio do Max Sharpe Ratio
print(all_weights[7547,:])
[0.27851125 0.6314676  0.01505158 0.06325824 0.01171134]
In [14]:
taxas_retorno["CARTEIRA_NOVA"] = (0.27851125*taxas_retorno["ITUB3.SA"] + 0.6314676*taxas_retorno["MGLU3.SA"] + 
                                   0.01505158*taxas_retorno["CIEL3.SA"] + 0.06325824*taxas_retorno["PETR3.SA"] + 
                                   0.01171134*taxas_retorno["CSNA3.SA"])
taxas_retorno
Out[14]:
Date ITUB3.SA MGLU3.SA CIEL3.SA PETR3.SA CSNA3.SA ^BVSP CARTEIRA CARTEIRA_NOVA
0 2012-01-02 NaN NaN NaN NaN NaN NaN NaN NaN
1 2012-01-03 0.017569 -0.006473 0.003716 0.034304 0.050333 NaN 0.003312 0.003621
2 2012-01-04 0.009360 -0.009790 0.003497 0.006225 0.006899 0.001686 -0.002850 -0.003048
3 2012-01-05 -0.015650 -0.008782 -0.014478 -0.006225 -0.008158 -0.013892 -0.011012 -0.010611
4 2012-01-06 0.014614 0.027192 -0.006270 -0.000833 -0.002524 0.000922 0.021597 0.021064
... ... ... ... ... ... ... ... ... ...
2724 2022-12-23 0.013355 0.030305 0.021464 0.049894 -0.022744 0.021944 0.023671 0.026069
2725 2022-12-26 -0.012892 -0.007491 -0.009699 -0.003156 0.016400 -0.008790 -0.008596 -0.008474
2726 2022-12-27 -0.009311 -0.054067 -0.009794 0.006652 0.022379 -0.003602 -0.036429 -0.036200
2727 2022-12-28 0.025400 0.065282 0.025269 -0.005598 0.022565 0.017294 0.049982 0.048588
2728 2022-12-29 -0.001826 0.018417 0.005742 -0.016272 -0.016360 -0.001870 0.010445 0.009987

2729 rows × 9 columns

In [15]:
taxas_retorno_port = taxas_retorno.filter(["Date", "CARTEIRA", "CARTEIRA_NOVA", "^BVSP"])
taxas_retorno_port
Out[15]:
Date CARTEIRA CARTEIRA_NOVA ^BVSP
0 2012-01-02 NaN NaN NaN
1 2012-01-03 0.003312 0.003621 NaN
2 2012-01-04 -0.002850 -0.003048 0.001686
3 2012-01-05 -0.011012 -0.010611 -0.013892
4 2012-01-06 0.021597 0.021064 0.000922
... ... ... ... ...
2724 2022-12-23 0.023671 0.026069 0.021944
2725 2022-12-26 -0.008596 -0.008474 -0.008790
2726 2022-12-27 -0.036429 -0.036200 -0.003602
2727 2022-12-28 0.049982 0.048588 0.017294
2728 2022-12-29 0.010445 0.009987 -0.001870

2729 rows × 4 columns

In [16]:
figura = px.line(title = 'Comparação de retorno Carteira x Ibovespa')
for i in taxas_retorno_port.columns[1:]:
  figura.add_scatter(x = taxas_retorno_port["Date"] ,y = taxas_retorno_port[i], name = i)
figura.add_hline(y = taxas_retorno_port['CARTEIRA'].mean(), line_color="blue", line_dash="dot", )
figura.add_hline(y = taxas_retorno_port['CARTEIRA_NOVA'].mean(), line_color="red", line_dash="dot", )
figura.show()
20142016201820202022−0.2−0.15−0.1−0.0500.050.10.150.2
CARTEIRACARTEIRA_NOVA^BVSPComparação de retorno Carteira x Ibovespa
plotly-logomark

ANOVA e Teste de Hipótese¶

Assumindo Normalidade¶

In [17]:
taxa_port = taxas_retorno_port.copy()
taxa_port.drop(labels = ['Date'], axis=1, inplace=True)
taxa_port = taxa_port.dropna()
taxa_port
Out[17]:
CARTEIRA CARTEIRA_NOVA ^BVSP
2 -0.002850 -0.003048 0.001686
3 -0.011012 -0.010611 -0.013892
4 0.021597 0.021064 0.000922
5 -0.005678 -0.004720 0.008209
6 -0.005269 -0.005444 0.012163
... ... ... ...
2724 0.023671 0.026069 0.021944
2725 -0.008596 -0.008474 -0.008790
2726 -0.036429 -0.036200 -0.003602
2727 0.049982 0.048588 0.017294
2728 0.010445 0.009987 -0.001870

2711 rows × 3 columns

In [18]:
plt.figure(figsize=(12,8))
sns.boxplot(x="variable", y="value", data=pd.melt(taxa_port))
plt.show()
In [19]:
stats.probplot(taxas_retorno_port["CARTEIRA"], dist="norm", plot=plt)
plt.show()
stats.probplot(taxas_retorno_port["CARTEIRA_NOVA"], dist="norm", plot=plt)
plt.show()
stats.probplot(taxas_retorno_port["^BVSP"], dist="norm", plot=plt)
plt.show()
In [20]:
# Ordinary Least Squares (OLS) model
model = ols('value ~ variable', data=pd.melt(taxa_port)).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table
Out[20]:
sum_sq df F PR(>F)
variable 0.000264 2.0 0.226356 0.797439
Residual 4.748675 8130.0 NaN NaN
In [21]:
res = stat()
res.anova_stat(pd.melt(taxa_port), res_var='value', anova_model='value ~ variable')
res.anova_summary
Out[21]:
df sum_sq mean_sq F PR(>F)
variable 2.0 0.000264 0.000132 0.226356 0.797439
Residual 8130.0 4.748675 0.000584 NaN NaN
In [22]:
res = stat()
res.tukey_hsd(pd.melt(taxa_port), res_var='value', xfac_var='variable', anova_model='value ~ variable')
res.tukey_summary
/home/joaogabrielsouza/anaconda3/lib/python3.10/site-packages/bioinfokit/analys.py:402: FutureWarning:

The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.

/home/joaogabrielsouza/anaconda3/lib/python3.10/site-packages/bioinfokit/analys.py:402: FutureWarning:

The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.

/home/joaogabrielsouza/anaconda3/lib/python3.10/site-packages/bioinfokit/analys.py:402: FutureWarning:

The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.

Out[22]:
group1 group2 Diff Lower Upper q-value p-value
0 CARTEIRA CARTEIRA_NOVA 0.000001 -0.001538 0.001540 0.002530 0.900000
1 CARTEIRA ^BVSP 0.000383 -0.001156 0.001922 0.825318 0.809700
2 CARTEIRA_NOVA ^BVSP 0.000382 -0.001157 0.001921 0.822787 0.810724
In [23]:
sm.qqplot(res.anova_std_residuals, line='45')
plt.xlabel("Theoretical Quantiles")
plt.ylabel("Standardized Residuals")
plt.show()
In [24]:
plt.hist(res.anova_model_out.resid, bins='auto', histtype='bar', ec='k') 
plt.xlabel("Residuals")
plt.ylabel('Frequency')
plt.show()
In [25]:
w, pvalue = stats.shapiro(model.resid)
print('Statistics= %.3f, pvalue=%.3f' % (w, pvalue))
# interpret
alpha = 0.05
if pvalue > alpha:
	print('Resíduo com Distribuição Normal (Não rejeita H0)')
else:
	print('Diferente da Normal (rejeita H0)')
Statistics= 0.919, pvalue=0.000
Diferente da Normal (rejeita H0)
/home/joaogabrielsouza/anaconda3/lib/python3.10/site-packages/scipy/stats/_morestats.py:1816: UserWarning:

p-value may not be accurate for N > 5000.

In [26]:
from scipy import stats

# perform the Kolmogorov-Smirnov test on the residuals
k_statistic, pvalue = stats.kstest(model.resid, 'norm')

# print the results and interpret
alpha = 0.05
if pvalue > alpha:
    print('Resíduo com Distribuição Normal (Não rejeita H0)')
else:
    print('Diferente da Normal (rejeita H0)')

print('K-S Statistic= %.3f, pvalue=%.3f' % (k_statistic, pvalue))
Diferente da Normal (rejeita H0)
K-S Statistic= 0.465, pvalue=0.000
In [27]:
w, pvalue = stats.bartlett(taxa_port['CARTEIRA'], taxa_port['^BVSP'])
print('Statistics= %.3f, pvalue=%.3f' % (w, pvalue))
# interpret
alpha = 0.05
if pvalue > alpha:
	print('Same Variance (Não rejeita H0)')
else:
	print('Different Variance (rejeita H0)')
Statistics= 778.219, pvalue=0.000
Different Variance (rejeita H0)
In [28]:
res = stat()
res.bartlett(pd.melt(taxa_port), res_var='value', xfac_var='variable')
res.bartlett_summary
Out[28]:
Parameter Value
0 Test statistics (T) 944.5239
1 Degrees of freedom (Df) 2.0000
2 p value 0.0000
In [29]:
res = stat()
res.levene(pd.melt(taxa_port), res_var='value', xfac_var='variable')
res.levene_summary
Out[29]:
Parameter Value
0 Test statistics (W) 198.9781
1 Degrees of freedom (Df) 2.0000
2 p value 0.0000

Não paramétrico Testes -> Mann-Whitney U Test¶

Não há Normalidade¶

In [30]:
stats, p = mannwhitneyu(taxa_port["CARTEIRA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
	print('Same distribution (Não rejeita H0)')
else:
	print('Different distribution (rejeita H0)')
Statistics= 3672252.000, p=0.965
Same distribution (Não rejeita H0)
In [31]:
stats, p = mannwhitneyu(taxa_port["CARTEIRA_NOVA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
	print('Same distribution (Não rejeita H0)')
else:
	print('Different distribution (rejeita H0)')
Statistics= 3675220.000, p=0.994
Same distribution (Não rejeita H0)
In [32]:
stats, p = mannwhitneyu(taxa_port["CARTEIRA_NOVA"], taxa_port["CARTEIRA"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
	print('Same distribution (Não rejeita H0)')
else:
	print('Different distribution (rejeita H0)')
Statistics= 3676389.000, p=0.977
Same distribution (Não rejeita H0)

Não paramétrico Testes -> Wilcoxon signed-rank Test¶

In [33]:
stats, p = wilcoxon(taxa_port["CARTEIRA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
	print('Same distribution (Não rejeita H0)')
else:
	print('Different distribution (rejeita H0)')
Statistics= 1817344.000, p=0.635
Same distribution (Não rejeita H0)
In [34]:
stats, p = wilcoxon(taxa_port["CARTEIRA_NOVA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
	print('Same distribution (Não rejeita H0)')
else:
	print('Different distribution (rejeita H0)')
Statistics= 1815236.000, p=0.598
Same distribution (Não rejeita H0)
In [35]:
stats, p = wilcoxon(taxa_port["CARTEIRA_NOVA"], taxa_port["CARTEIRA"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
	print('Same distribution (Não rejeita H0)')
else:
	print('Different distribution (rejeita H0)')
Statistics= 1820993.000, p=0.775
Same distribution (Não rejeita H0)

Não paramétrico Testes -> Kruskal-Wallis Test¶

In [36]:
stats, p = kruskal(taxa_port["CARTEIRA_NOVA"], taxa_port["CARTEIRA"], taxa_port["^BVSP"])
print('Statistics= %.3f, p=%.3f' % (stats, p))
# interpret
alpha = 0.05
if p > alpha:
	print('Same distribution (Não rejeita H0)')
else:
	print('Different distribution (rejeita H0)')
Statistics= 0.002, p=0.999
Same distribution (Não rejeita H0)